import pandas as pd
from sklearn.metrics import mean_absolute_error
from random import shuffle
from gc import collect
# bokeh.charts was removed from bokeh (split out to the defunct bkcharts
# package); show() lives in bokeh.io, as the later imports in this file use.
from bokeh.io import show
from bokeh.plotting import figure
from bokeh.layouts import gridplot
from bokeh.io import output_notebook
output_notebook()
import warnings
# NOTE(review): blanket-silencing all warnings hides deprecation notices;
# kept for notebook readability.
warnings.filterwarnings("ignore")
def normalize(column):
    """Mean-normalize a Series: subtract the mean, divide by the range."""
    spread = column.max() - column.min()
    return (column - column.mean()) / spread
def normalize_columns(data, columns):
    """Return a copy of *data* with each listed column mean-normalized.

    The original frame is left untouched; columns not listed are copied as-is.
    """
    result = data.copy()
    for name in columns:
        col = result[name]
        result[name] = (col - col.mean()) / (col.max() - col.min())
    return result
# data = pd.read_csv("../data/train_complete_2016.csv")
# data.shape
# Load the pre-split train/test sets; the first CSV column is the row index.
train = pd.read_csv("../data/train.csv", index_col=0)
test = pd.read_csv("../data/test.csv", index_col=0)
# test_target = test["logerror"]
# train_target = train["logerror"]
# del test["logerror"]
# del train["logerror"]
# Notebook-style peek at the frames (no effect when run as a plain script).
train.head()
test.head()
# Numeric feature columns used for correlation plots, median imputation
# and mean-normalization below.
numeric_columns = ['basementsqft', 'bathroomcnt', 'bedroomcnt',
'threequarterbathnbr', 'finishedfloor1squarefeet',
'calculatedfinishedsquarefeet',
'finishedsquarefeet6', 'finishedsquarefeet12', 'finishedsquarefeet13',
'finishedsquarefeet15', 'finishedsquarefeet50', 'fireplacecnt',
'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'latitude',
'longitude', 'lotsizesquarefeet', 'poolsizesum', 'roomcnt', 'unitcnt',
'yardbuildingsqft17', 'yardbuildingsqft26', 'yearbuilt', 'taxamount',
'structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt']
# Plotting attribute correlation with the target column.
def plot_correlation(data, column_name, alpha=0.2, sample_ratio=1):
    """Scatter plot of *column_name* against "logerror" on a row sample.

    alpha        -- marker transparency.
    sample_ratio -- fraction of rows in (0, 1] to draw, to keep plots light.
    Returns the bokeh figure.
    """
    # width/height instead of the removed plot_width/plot_height, for
    # consistency with the figure() call in plot_mlp_scores below.
    p = figure(width=300, height=300,
               title=column_name + " vs " + "logerror",
               tools=["xwheel_zoom", "xpan"])
    n_sample = int(sample_ratio * len(data))
    data_plot = data.sample(n_sample)
    p.circle(data_plot[column_name], data_plot["logerror"], alpha=alpha)
    return p
def correlations_df(data, column_name):
    """Return the global corr_df extended with one row for *column_name*.

    The row holds the absolute Pearson correlation of the column with
    "logerror" and its sign ('+' or '-'). The global corr_df is not
    mutated in place; the caller is expected to rebind it to the result.
    """
    corr = data[column_name].corr(data["logerror"])
    sign = '+' if corr >= 0 else '-'
    corr = abs(corr)
    # DataFrame.append was removed in pandas 2.0 — use pd.concat instead.
    row = pd.DataFrame([{"attribute": column_name, "sign": sign, "corr": corr}])
    return pd.concat([corr_df, row])
# Build the |correlation| table and a grid of scatter plots, three per row.
corr_df = pd.DataFrame()
grid = []
for column_name in numeric_columns:
    corr_df = correlations_df(train, column_name)
    p = plot_correlation(train, column_name, alpha=0.1, sample_ratio=0.25)
    # Start a new row when the current one is full. (The original seeded
    # grid with [[]], which left a blank first row in the layout.)
    if not grid or len(grid[-1]) == 3:
        grid.append([p])
    else:
        grid[-1].append(p)
show(gridplot(grid))
# corr_df.sort_values("corr", ascending=False)
# Replacing missing values with the column median
def column_median(column):
    """Median of a pandas Series (NaN values are skipped by pandas)."""
    return column.median()
# Impute missing numeric values with the column median computed over the
# combined train+test frame, then mean-normalize the numeric columns.
# DataFrame.append was removed in pandas 2.0 — use pd.concat.
data = pd.concat([train, test])
sample5 = train.sample(5)
sample5[numeric_columns]
for column_name in numeric_columns:
    median_value = column_median(data[column_name])
    data[column_name] = data[column_name].fillna(median_value)
sample5 = data.loc[sample5.index.tolist()]
sample5[numeric_columns]
for column in numeric_columns:
    # columns must not contain NaN after imputation
    assert not data[column].isnull().any()
norm_data = normalize_columns(data, numeric_columns)
sample5 = norm_data.loc[sample5.index.tolist()]
sample5[numeric_columns]
data = norm_data
# Split back by index LABEL. The original used .iloc (positional), which
# treats CSV index labels as row positions and selects the wrong rows for
# test in the concatenated frame.
train = data.loc[train.index]
test = data.loc[test.index]
data = None
norm_data = None
collect()
from sklearn.linear_model import SGDRegressor, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest
from numpy import median
def scorer(estimator, X, y):
    """Mean absolute error, with the (estimator, X, y) signature that
    cross_val_score expects for a callable scorer."""
    predicted = estimator.predict(X)
    return mean_absolute_error(predicted, y)
def scores(model, train, train_target, test, test_target, k, scorer=scorer):
    """Fit *model* on train and report MAE on train, test, and k-fold CV.

    Returns a dict with keys "train", "test" and "validation" (the median
    of the k cross-validation fold scores).
    """
    fitted = model.fit(train, train_target)
    fold_scores = cross_val_score(model, train, train_target, cv=k, scoring=scorer)
    return {
        "train": scorer(fitted, train, train_target),
        "test": scorer(fitted, test, test_target),
        "validation": median(fold_scores),
    }
# Univariate feature selection: keep the 10 numeric columns that score
# best (SelectKBest's default scoring) against the target.
skb = SelectKBest(k=10)
fit = skb.fit(train[numeric_columns], train["logerror"])
use_columns = train[numeric_columns].columns[fit.get_support()].tolist()
use_columns
results = []  # one dict per evaluated model configuration
k = 5  # number of cross-validation folds
# train, test = partition(data[use_columns + ["logerror"]], train_proportion=0.7)
collect()
# Linear SGD regressors over a small (alpha, max_iter) grid; each run is
# scored on train/test/CV and recorded in results.
sgd_grid = [(0.001, 1000), (0.0001, 2000), (0.0001, 1000)]
for alpha, max_iter in sgd_grid:
    model = SGDRegressor(alpha=alpha, max_iter=max_iter)
    scores_dict = scores(model, train[use_columns], train["logerror"],
                         test[use_columns], test["logerror"], k, scorer=scorer)
    results.append({"model": "linear SGDRegressor",
                    "score_test": scores_dict["test"],
                    "score_train": scores_dict["train"],
                    "score_cv": scores_dict["validation"],
                    "tags": "alpha" + str(alpha) + ", max_iter=" + str(max_iter)})
# Ridge regressors over an alpha sweep. The first (default-alpha) run was
# recorded without a "tags" key, which is preserved here.
ridge_configs = [
    ({}, None),
    ({"alpha": 2.0}, "alpha=2"),
    ({"alpha": 10.0}, "alpha=10"),
    ({"alpha": 0.5}, "alpha=0.5"),
    ({"alpha": 0.2}, "alpha=0.2"),
]
for ridge_kwargs, tag in ridge_configs:
    model = Ridge(**ridge_kwargs)
    scores_dict = scores(model, train[use_columns], train["logerror"],
                         test[use_columns], test["logerror"], k, scorer=scorer)
    entry = {"model": "linear Ridge",
             "score_test": scores_dict["test"],
             "score_train": scores_dict["train"],
             "score_cv": scores_dict["validation"]}
    if tag is not None:
        entry["tags"] = tag
    results.append(entry)
from sklearn.preprocessing import PolynomialFeatures
def polynomial_features(df, degree=2):
    """Expand *df* into polynomial feature columns of the given degree.

    Returns a new DataFrame (fresh RangeIndex) whose columns are the
    generated feature names ("1", "x0", "x0 x1", ...).
    """
    poly = PolynomialFeatures(degree=degree)
    polynomial_data = pd.DataFrame(poly.fit_transform(df))
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement API when available and fall back on older versions.
    if hasattr(poly, "get_feature_names_out"):
        polynomial_data.columns = poly.get_feature_names_out()
    else:
        polynomial_data.columns = poly.get_feature_names()
    return polynomial_data
# Degree-3 polynomial expansion of the selected feature columns.
poly_test = polynomial_features(test[use_columns], degree=3)
poly_test.head()
poly_train = polynomial_features(train[use_columns], degree=3)
poly_train.head()
# Re-run univariate selection on the expanded feature space and keep the
# 10 best polynomial terms.
skb = SelectKBest(k=10)
fit = skb.fit(poly_train, train["logerror"])
use_poly_columns = poly_train.columns[fit.get_support()].tolist()
poly_train = poly_train[use_poly_columns]
print(poly_train.columns.tolist())
poly_train.head()
# Models evaluated on the degree-3 polynomial features.
# NOTE(review): the original tags said "degree=4" although the expansion
# above uses degree=3; corrected here.
poly_columns = poly_train.columns.tolist()
poly_model_configs = [
    (SGDRegressor(max_iter=2000), "Polynomial SGD Regressor", "degree=3"),
    (Ridge(), "Polynomial Ridge", "degree=3"),
    (Ridge(alpha=0.5), "Polynomial Ridge", "degree=3, alpha=0.5"),
    (Ridge(alpha=10), "Polynomial Ridge", "degree=3, alpha=10.0"),
]
for model, model_name, tag in poly_model_configs:
    scores_dict = scores(model, poly_train, train["logerror"],
                         poly_test[poly_columns], test["logerror"], k, scorer=scorer)
    results.append({"model": model_name,
                    "score_test": scores_dict["test"],
                    "score_train": scores_dict["train"],
                    "score_cv": scores_dict["validation"],
                    "tags": tag})
# Collect all linear/polynomial model scores and persist them to CSV.
results_df = pd.DataFrame(results)
# Notebook display of the ranking (result discarded when run as a script).
results_df.sort_values("score_cv")
results_df.to_csv("results_df2.csv", index=False)
mlp_results = []
# [int(len(train[use_columns].columns)) * 1.5] * 10
# Every hidden layer is 1.5x the number of selected features wide.
layer_width = int(len(train[use_columns].columns) * 1.5)
# NOTE(review): the swept value is the NUMBER of hidden layers (up to 400),
# not the layer width — confirm that very deep nets are intended.
for hidden_layer_sizes in [10, 50, 100, 200, 300, 400]:
    model = MLPRegressor(hidden_layer_sizes=[layer_width] * hidden_layer_sizes)
    print("hidden_layer_sizes:", hidden_layer_sizes)
    scores_dict = scores(model, train[use_columns], train["logerror"],
                         test[use_columns], test["logerror"], k, scorer=scorer)
    mlp_results.append({"model": "linear MLP",
                        "score_test": scores_dict["test"],
                        "score_train": scores_dict["train"],
                        "score_cv": scores_dict["validation"],
                        "tags": "hidden_layer_sizes=" + str(hidden_layer_sizes)})
    print(pd.DataFrame(mlp_results))
# Same depth sweep on the polynomial features; layer width = feature count.
poly_layer_width = int(len(poly_train.columns) * 1)
for hidden_layer_sizes in [10, 50, 100, 200, 300, 400]:
    model = MLPRegressor(hidden_layer_sizes=[poly_layer_width] * hidden_layer_sizes)
    print("hidden_layer_sizes:", hidden_layer_sizes)
    scores_dict = scores(model, poly_train, train["logerror"],
                         poly_test[poly_train.columns.tolist()], test["logerror"], k, scorer=scorer)
    mlp_results.append({"model": "polynomial MLP",
                        "score_test": scores_dict["test"],
                        "score_train": scores_dict["train"],
                        "score_cv": scores_dict["validation"],
                        "tags": "hidden_layer_sizes=" + str(hidden_layer_sizes)})
    print(pd.DataFrame(mlp_results))
mlp_results_df = pd.DataFrame(mlp_results).sort_values("score_cv")
# Recover the layer count from the "hidden_layer_sizes=N" tag string.
mlp_results_df["hidden_layer_sizes"] = mlp_results_df["tags"].apply(lambda value: int(value.split("=")[1]))
mlp_results_df = mlp_results_df.sort_values("hidden_layer_sizes")
# NOTE(review): this sorted view is discarded (no assignment) — a leftover
# notebook display cell.
mlp_results_df.sort_values("score_cv")
mlp_results_df.to_csv("mlp_results_df2.csv", index=False)
from bokeh.plotting import figure
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot, row
output_notebook()
# Split the MLP sweep results by model family for plotting.
poli_mlp = mlp_results_df[mlp_results_df["model"] == "polynomial MLP"]
linear_mlp = mlp_results_df[mlp_results_df["model"] == "linear MLP"]
# Median scores per model family. Selecting groupby columns with a bare
# tuple was removed in pandas 2.0 — pass a list instead.
results_df.groupby("model")[["score_cv", "score_test", "score_train"]].apply(median)
results_df[results_df["model"].str.contains("SGD")]["score_train"].median()
# poli_mlp = poli_mlp.append(pd.DataFrame([{"model": "Polynomial SGD Regressor", "score_cv": 0.069465, "score_test": 0.069217, "score_train": 0.069406, "tags": "SGD", "hidden_layer_sizes": 0}]))
# linear_mlp = linear_mlp.append(pd.DataFrame([{"model": "Linear SGD Regressor", "score_cv": 0.069088, "score_test": 0.068942, "score_train": 0.069135, "tags": "SGD", "hidden_layer_sizes": 0}]))
# poli_mlp = poli_mlp.sort_values("hidden_layer_sizes")
# linear_mlp = linear_mlp.sort_values("hidden_layer_sizes")
def plot_mlp_scores(mlp_data, range_y, title, sgd_data=None):
    """Line plot of train/CV MAE against the number of hidden layers.

    mlp_data -- frame with hidden_layer_sizes / score_train / score_cv.
    range_y  -- (low, high) y-axis range shared across panels.
    sgd_data -- optional results frame; the median scores of its SGD rows
                are drawn as reference points at x=0.
    Returns the bokeh figure.
    """
    # NOTE(review): the legend= keyword was replaced by legend_label= in
    # newer bokeh releases — confirm against the installed version.
    p = figure(width=400, height=400, tools=["save", "xpan", "xwheel_zoom", "reset"],
               x_axis_label="n_layers", y_axis_label="score", title=title, y_range=range_y)
    p.line(x=mlp_data["hidden_layer_sizes"], y=mlp_data["score_train"],
           line_width=2, color="red", legend="score_train")
    p.line(x=mlp_data["hidden_layer_sizes"], y=mlp_data["score_cv"],
           line_width=2, color="green", legend="score_cv")
    if sgd_data is not None:  # idiomatic form of "not sgd_data is None"
        sgd_rows = sgd_data[sgd_data["model"].str.contains("SGD")]
        p.circle(x=0, y=sgd_rows["score_train"].median(), color="red")
        p.circle(x=0, y=sgd_rows["score_cv"].median(), color="green")
    return p
# Shared y-range so the two panels are directly comparable.
range_y = (0.066, 0.0754)
p1 = plot_mlp_scores(linear_mlp, range_y, title="Linear MLP", sgd_data=results_df[results_df["model"].str.contains("linear")])
p2 = plot_mlp_scores(poli_mlp, range_y, title="Polynomial MLP", sgd_data=results_df[results_df["model"].str.contains("Polyn")])
grid = gridplot([[p1, p2]])
show(grid)